This notebook consolidates the quality control (QC) diagnostics for the CCLE proteomic assays curated in the PharmacoSet, spanning both the Gygi TMT mass spectrometry release and the TCPA RPPA500 panel. It reproduces the previously published QC checks, adds side-by-side summaries, and extends the analysis with additional diagnostics frequently requested by collaborating laboratories. The emphasis is on practical indicators that external researchers can use to rapidly assess assay depth, missingness, variance structure, and replicate consistency.
project_root <- here::here()
knitr::opts_knit$set(root.dir = project_root)
pset_path <- file.path(project_root, "results", "CCLE_PharmacoSet.RDS")
if (!file.exists(pset_path)) {
stop("PharmacoSet file not found: ", pset_path)
}
pset <- readRDS(pset_path)
pset
## <PharmacoSet>
## Name: CCLE_(2019)
## Date Created: Wed Nov 19 13:03:16 2025
## Number of samples: 1457
## Molecular profiles: <MultiAssayExperiment>
## ExperimentList class object of length 13:
## [1] rnaseq.transcripts_tpm : RangedSummarizedExperiment with 57820 rows and 1018 columns
## [2] rnaseq.genes_tpm : RangedSummarizedExperiment with 196520 rows and 1018 columns
## [3] rnaseq.genes_counts : RangedSummarizedExperiment with 56202 rows and 1018 columns
## [4] rnaseq.genes_rpkm : RangedSummarizedExperiment with 56202 rows and 1018 columns
## [5] cnv.genes : RangedSummarizedExperiment with 17730 rows and 746 columns
## [6] mut.genes : RangedSummarizedExperiment with 19627 rows and 1351 columns
## [7] proteomics.rppa : SummarizedExperiment with 447 rows and 877 columns
## [8] proteomics.massspec : SummarizedExperiment with 12755 rows and 374 columns
## [9] rrbs.tss_1kb : RangedSummarizedExperiment with 21337 rows and 842 columns
## [10] rrbs.tss_cpg_clusters : RangedSummarizedExperiment with 56145 rows and 842 columns
## [11] rrbs.cgi_cpg_clusters : RangedSummarizedExperiment with 81037 rows and 842 columns
## [12] rrbs.enhancer_cpg_clusters : RangedSummarizedExperiment with 1355 rows and 842 columns
## [13] metabolomics.lcms : SummarizedExperiment with 225 rows and 927 columns
## Treatment response: <TreatmentResponseExperiment>
## dim: 184 493
## assays(2): sensitivity profiles
## rownames(184): 17AAG:0.0025 17AAG:0.008 ... TOPOTECAN:2.53 TOPOTECAN:8
## rowData(2): treatmentid dose
## colnames(493): 1321N1 22Rv1 42-MG-BA ... YKG-1 ZR-75-1 ZR-75-30
## colData(1): sampleid
## metadata(5): data_source filename annotation date sessionInfo
assay_list <- molecularProfiles(pset)
mass_spec <- assay_list[["proteomics.massspec"]]
rppa <- assay_list[["proteomics.rppa"]]
rna_tpm <- assay_list[["rnaseq.genes_tpm"]]
if (is.null(mass_spec) || is.null(rppa)) {
stop(
"Expected proteomics assays (mass spec and RPPA) were not found in the PharmacoSet."
)
}
cell_meta <- cellInfo(pset) |>
as_tibble(rownames = "cellid")
mass_spec_sample_meta <- SummarizedExperiment::colData(mass_spec) |>
as.data.frame() |>
as_tibble()
if (!"sampleid" %in% names(mass_spec_sample_meta)) {
mass_spec_sample_meta <- mass_spec_sample_meta |>
mutate(sampleid = rownames(SummarizedExperiment::colData(mass_spec)))
}
mass_spec_sample_meta <- mass_spec_sample_meta |>
mutate(
dataset_sample_id = coalesce(dataset_sample_id, sampleid),
sampleid = as.character(sampleid)
)
mass_spec_feature_meta <- SummarizedExperiment::rowData(mass_spec) |>
as.data.frame() |>
as_tibble()
if (!"feature_id" %in% names(mass_spec_feature_meta)) {
mass_spec_feature_meta <- mass_spec_feature_meta |>
mutate(feature_id = rownames(SummarizedExperiment::rowData(mass_spec)))
}
tenplex_cols <- mass_spec_feature_meta |>
select(starts_with("TenPx")) |>
colnames()
mass_spec_feature_meta <- mass_spec_feature_meta |>
mutate(
total_peptides = if (length(tenplex_cols) > 0) {
rowSums(across(all_of(tenplex_cols)), na.rm = TRUE)
} else {
NA_real_
}
)
rppa_sample_meta <- SummarizedExperiment::colData(rppa) |>
as.data.frame() |>
as_tibble()
if (!"sampleid" %in% names(rppa_sample_meta)) {
rppa_sample_meta <- rppa_sample_meta |>
mutate(sampleid = rownames(SummarizedExperiment::colData(rppa)))
}
rppa_sample_meta <- rppa_sample_meta |>
mutate(
dataset_sample_id = coalesce(dataset_sample_id, sampleid),
sampleid = as.character(sampleid)
)
if (is.null(tissue_color_map) || !length(tissue_color_map)) {
ms_tissue_all <- as.character(mass_spec_sample_meta$tissue)
ms_tissue_all <- ifelse(
is.na(ms_tissue_all) | ms_tissue_all == "", "unspecified", ms_tissue_all
)
rppa_tissue_all <- as.character(rppa_sample_meta$tissue)
rppa_tissue_all <- ifelse(
is.na(rppa_tissue_all) | rppa_tissue_all == "", "unspecified", rppa_tissue_all
)
combined_tissues <- c(ms_tissue_all, rppa_tissue_all)
combined_tissues <- combined_tissues[!is.na(combined_tissues)]
tissue_levels_consistent <- sort(unique(combined_tissues))
tissue_color_map <- setNames(
rep(palette_extended, length.out = length(tissue_levels_consistent)),
tissue_levels_consistent
)
}
rppa_feature_meta <- SummarizedExperiment::rowData(rppa) |>
as.data.frame() |>
as_tibble()
if (!"feature_id" %in% names(rppa_feature_meta)) {
rppa_feature_meta <- rppa_feature_meta |>
mutate(feature_id = rownames(SummarizedExperiment::rowData(rppa)))
}
mass_spec_exprs <- SummarizedExperiment::assay(mass_spec, "exprs")
rppa_exprs <- SummarizedExperiment::assay(rppa, "exprs")
assay_overview <- tibble(
assay = names(assay_list),
features = purrr::map_int(assay_list, nrow),
samples = purrr::map_int(assay_list, ncol)
) |>
arrange(desc(features))
DT::datatable(
assay_overview,
colnames = c("Assay", "Features", "Samples"),
caption = "Available molecular assays in the CCLE PharmacoSet.",
options = list(dom = 't')
)
This section verifies that the proteomic profiles are correctly linked to cell line identifiers. * Dataset Mismatch: Should be 0. Non-zero indicates potential sample swaps or naming errors. * DepMap Missing: Should be 0. Indicates samples that cannot be linked to the Broad DepMap metadata. * Multi-measurement: Indicates cell lines with biological replicates. These are usually averaged in the final profile.
mass_spec_samples <- mass_spec_sample_meta |>
mutate(across(everything(), as.character)) |>
select(
sampleid,
depmap_id,
cell_line,
tissue,
dataset_sample_id,
n_measurements,
plex_id
) |>
left_join(
cell_meta |>
select(cellid, CCLE.sampleid, CCLE.name, CCLE.site_Primary, CCLE.type),
by = c("sampleid" = "cellid")
) |>
mutate(
ccle_sampleid = coalesce(CCLE.sampleid, sampleid),
dataset_match = if_else(
is.na(dataset_sample_id) | is.na(ccle_sampleid),
FALSE,
str_detect(dataset_sample_id, fixed(ccle_sampleid))
),
tissue = if_else(is.na(tissue) | tissue == "", "unspecified", tissue)
)
alignment_summary <- mass_spec_samples |>
summarise(
samples = n(),
depmap_missing = sum(is.na(depmap_id)),
dataset_mismatch = sum(!dataset_match),
multi_measurement = sum(as.integer(n_measurements) > 1)
)
DT::datatable(
alignment_summary,
caption = "Mass spectrometry sample alignment diagnostics.",
options = list(dom = 't')
)
All 374 proteomic profiles resolved to CCLE samples and to DepMap
identifiers following curation. Three records retain multiple Gygi
measurements (multi_measurement), reflecting biological
replicates that were averaged (mean) into the final profile while
keeping provenance in the metadata.
tissue_counts <- mass_spec_samples |>
count(tissue, name = "n") |>
arrange(desc(n)) |>
mutate(share = n / sum(n))
tissue_cols_mass_bar <- tissue_palette(tissue_counts$tissue)
ggplot(tissue_counts, aes(x = reorder(tissue, n), y = n, fill = tissue)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(
title = "Mass spectrometry sample composition by tissue",
x = "Tissue",
y = "Samples"
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
scale_fill_manual(values = tissue_cols_mass_bar, na.translate = TRUE) +
theme_minimal(base_size = 12)
protein_group_count <- nrow(mass_spec_exprs)
peptide_summary <- mass_spec_feature_meta |>
summarise(
proteins_with_peptides = sum(!is.na(total_peptides) & total_peptides > 0),
median_peptides = median(total_peptides, na.rm = TRUE),
q1_peptides = quantile(total_peptides, probs = 0.25, na.rm = TRUE),
q3_peptides = quantile(total_peptides, probs = 0.75, na.rm = TRUE)
)
depth_summary <- tibble(
metric = c(
"Protein groups quantified",
"Entries supported by ≥1 peptide",
"Median peptides per protein",
"Peptide IQR (Q1–Q3)"
),
value = c(
as.character(protein_group_count),
as.character(peptide_summary$proteins_with_peptides),
as.character(round(peptide_summary$median_peptides, 1)),
paste(
round(peptide_summary$q1_peptides, 1),
round(peptide_summary$q3_peptides, 1),
sep = " – "
)
)
)
DT::datatable(
depth_summary,
colnames = c("Metric", "Value"),
caption = "Mass spectrometry quantification depth.",
options = list(dom = 't')
)
The Gygi release quantifies 12,755 protein groups. Peptide support is high (median ~40 peptides per protein group), aligning with expectations for deep TMT profiling of cell line digests.
plex_summary <- mass_spec_samples |>
mutate(
plex_id = if_else(is.na(plex_id) | plex_id == "", "unspecified", plex_id),
n_measurements = as.integer(n_measurements)
) |>
count(plex_id, name = "samples") |>
arrange(desc(samples))
max_samples_per_plex <- max(plex_summary$samples, na.rm = TRUE)
ggplot(plex_summary, aes(x = reorder(plex_id, samples), y = samples)) +
geom_col(fill = col_mass_plex) +
coord_flip() +
labs(
title = "Ten-plex composition",
x = "TMT plex",
y = "Samples"
) +
scale_y_continuous(
limits = c(0, 10),
breaks = seq(0, 10, by = 2),
expand = expansion(mult = c(0, 0.05))
) +
theme_minimal(base_size = 12)
The last three TMT plex labels correspond to the merged measurements; otherwise, all plexes would have 9 count (ten channels with one for reference = nine) except TenPx07. Likely experimental inconsistency, not an issue.
mass_spec_samples |>
count(n_measurements, name = "samples") |>
arrange(as.integer(n_measurements)) |>
DT::datatable(
colnames = c("Merged Gygi measurements", "Samples"),
caption = "Number of Gygi quantifications contributing to each CCLE profile.",
options = list(dom = 't')
)
The tally confirms that exactly three CCLE profiles were built from two Gygi measurements each; all remaining 371 profiles aggregate a single quantification run.
NA
values. Downstream analyses (e.g., PCA) typically require imputation
(e.g., KNN or min-prob).The missing value survey highlights whether particular plexes or samples require imputation or exclusion before downstream modeling.
if (!exists("compute_missingness")) {
compute_missingness <- function(mat) {
sample_missing <- colMeans(is.na(mat)) * 100
feature_missing <- rowMeans(is.na(mat)) * 100
list(
sample = tibble(
sampleid = names(sample_missing),
missing_pct = unname(sample_missing)
),
feature = tibble(
feature_id = rownames(mat),
missing_pct = feature_missing
),
dataset = tibble(
missing_pct = mean(is.na(mat)) * 100
)
)
}
}
ms_missing <- compute_missingness(mass_spec_exprs)
sample_missing_meta <- ms_missing$sample |>
left_join(mass_spec_samples, by = "sampleid") |>
mutate(
plex_id = if_else(is.na(plex_id) | plex_id == "", "unspecified", plex_id)
)
sample_missing_meta_expanded <- sample_missing_meta |>
tidyr::separate_rows(plex_id, sep = "\\|")
sample_hist <- hist(
ms_missing$sample$missing_pct,
breaks = seq(0, 100, length.out = 31),
plot = FALSE
)
sample_y_max <- max(sample_hist$counts, 1)
sample_missing_plot <- sample_missing_meta |>
ggplot(aes(x = missing_pct)) +
geom_histogram(
fill = col_mass_missing_sample,
color = "white",
bins = 30,
boundary = 0,
closed = "left"
) +
labs(
title = "Mass spec: per-sample missing values",
x = "Missing values (%)",
y = "Samples"
) +
scale_x_continuous(limits = c(0, 100), expand = expansion(mult = c(0, 0))) +
scale_y_continuous(
limits = c(0, sample_y_max),
expand = expansion(mult = c(0, 0.05)),
labels = scales::number_format(accuracy = 1)
) +
theme_minimal(base_size = 12)
print(sample_missing_plot)
plex_missing_plot <- sample_missing_meta_expanded |>
mutate(
plex_id = forcats::fct_reorder(
plex_id,
missing_pct,
.fun = median,
.desc = TRUE
)
) |>
ggplot(aes(x = plex_id, y = missing_pct)) +
geom_boxplot(
fill = col_mass_missing_sample,
color = "white",
outlier.alpha = 0.25
) +
coord_flip() +
labs(
title = "Per-plex distribution of sample missingness",
x = "TMT plex",
y = "Sample missing values (%)"
) +
theme_minimal(base_size = 12)
print(plex_missing_plot)
plex_missing_summary <- sample_missing_meta_expanded |>
group_by(plex_id) |>
summarise(
samples = n(),
missing_pct = round(first(missing_pct), 2),
distinct_missing = dplyr::n_distinct(round(missing_pct, 4)),
.groups = "drop"
) |>
arrange(desc(missing_pct), desc(samples))
if (any(plex_missing_summary$distinct_missing > 1)) {
warning("Detected plexes with heterogeneous missingness values.")
}
plex_missing_summary <- plex_missing_summary |>
select(-distinct_missing)
DT::datatable(
plex_missing_summary,
colnames = c("Plex", "Samples", "Missing (%)"),
caption = "Per-plex sample missingness summary (all TMT batches; counts expanded for replicated runs).",
options = list(pageLength = 10, scrollX = TRUE)
)
feature_hist <- hist(
ms_missing$feature$missing_pct,
breaks = seq(0, 100, length.out = 31),
plot = FALSE
)
feature_y_max <- max(feature_hist$counts, 1)
feature_missing_plot <- ms_missing$feature |>
ggplot(aes(x = missing_pct)) +
geom_histogram(
fill = col_mass_missing_feature,
color = "white",
bins = 30,
boundary = 0,
closed = "left"
) +
labs(
title = "Mass spec: per-protein missing values",
x = "Missing values (%)",
y = "Protein groups"
) +
scale_x_continuous(limits = c(0, 100), expand = expansion(mult = c(0, 0))) +
scale_y_continuous(
limits = c(0, feature_y_max),
expand = expansion(mult = c(0, 0.05)),
labels = scales::number_format(accuracy = 1)
) +
theme_minimal(base_size = 12)
print(feature_missing_plot)
Entire TMT plexes share identical missingness because each
multiplexed run experiences the same loading, digestion, and acquisition
depth. After expanding the few multi-plex labels (e.g.,
02|28) back to their constituent runs, nearly every plex
reports nine CCLE lanes (one channel remains the pooled bridge) and all
samples within that plex converge to the same missing percentage. Plex
10, 41, and 39 sit at the top of the table because those specific
batches were slightly shallower, so the sparse signals show up as a
plex-level effect rather than isolated sample failures.
if (!exists("compute_cv_by_group")) {
compute_cv_by_group <- function(
mat,
groups,
min_group_size = 3,
feature_missing_cutoff = 0.5
) {
if (!is.matrix(mat)) {
mat <- as.matrix(mat)
}
valid_features <- rowMeans(is.na(mat)) <= feature_missing_cutoff
mat <- mat[valid_features, , drop = FALSE]
groups <- as.character(groups)
valid_levels <- names(which(table(groups) >= min_group_size))
purrr::map_dfr(
valid_levels,
function(level) {
idx <- which(groups == level)
sub_mat <- mat[, idx, drop = FALSE]
if (ncol(sub_mat) < min_group_size) {
return(NULL)
}
retain <- rowMeans(is.na(sub_mat)) <= feature_missing_cutoff
sub_mat <- sub_mat[retain, , drop = FALSE]
if (!nrow(sub_mat)) {
return(NULL)
}
linear_values <- 2^sub_mat
row_means <- matrixStats::rowMeans2(linear_values, na.rm = TRUE)
row_sds <- matrixStats::rowSds(linear_values, na.rm = TRUE)
cv <- (row_sds / (row_means + 1e-6)) * 100
tibble(
tissue = level,
feature_id = rownames(sub_mat),
cv = cv
)
}
)
}
}
mass_spec_cv <- compute_cv_by_group(
mass_spec_exprs,
groups = mass_spec_samples$tissue,
min_group_size = 3,
feature_missing_cutoff = 0.5
)
if (nrow(mass_spec_cv)) {
cv_summary <- mass_spec_cv |>
group_by(tissue) |>
summarise(
median_cv = median(cv, na.rm = TRUE),
iqr_cv = IQR(cv, na.rm = TRUE),
features = n()
) |>
arrange(desc(median_cv)) |>
mutate(
median_cv = round(median_cv, 2),
iqr_cv = round(iqr_cv, 2)
)
DT::datatable(
cv_summary,
colnames = c("Tissue", "Median CV (%)", "IQR", "Features"),
caption = "Coefficient of variation across tissues (mass spec; computed on 2^intensity scale).",
options = list(pageLength = 10, scrollX = TRUE)
)
cv_boxplot_df <- mass_spec_cv |>
mutate(
tissue = fct_reorder(tissue, cv, .fun = median, .desc = TRUE)
)
cv_boxplot_cols <- tissue_palette(cv_boxplot_df$tissue)
cv_boxplot_df |>
ggplot(aes(x = tissue, y = cv, fill = tissue)) +
geom_boxplot(outlier.alpha = 0.1, show.legend = FALSE) +
coord_flip() +
labs(
title = "Mass spec CV distribution by tissue",
x = "Tissue (≥3 samples)",
y = "Coefficient of variation (%)"
) +
scale_fill_manual(values = cv_boxplot_cols, na.translate = TRUE) +
theme_minimal(base_size = 12)
}
The tissue CV analysis computes the coefficient of variation (standard deviation divided by mean, on the linear 2^intensity scale) for each protein within each tissue cohort.
Tissue-specific median CVs cluster below 20%, consistent with the expectation for well-controlled TMT workflows. Hematopoietic lineages show modestly higher dispersion, matching the increased phenotypic heterogeneity noted in the Gygi manuscript.
The CV boxplot uses ggplot’s default Tukey rule: the box spans the interquartile range (25th–75th percentile), whiskers extend to the furthest points within 1.5×IQR of the quartiles, and any sample whose CV falls beyond those whiskers is drawn as an outlier point. The long CV outlier tails stem from proteins whose mean abundance is near zero; even minor absolute fluctuations yield disproportionately high CV% once the denominator collapses. These usually represent low-signal proteins rather than true instability.
if (nrow(mass_spec_cv)) {
if (!requireNamespace("ggridges", quietly = TRUE)) {
stop("Package 'ggridges' is required for ridge density plots.")
}
cv_ridge_df <- mass_spec_cv |>
mutate(
tissue = fct_reorder(tissue, cv, .fun = median, .desc = TRUE)
)
cv_ridge_cols <- tissue_palette(cv_ridge_df$tissue)
cv_ridge_df |>
ggplot(aes(x = cv, y = tissue, fill = tissue)) +
ggridges::geom_density_ridges(
quantile_lines = TRUE,
quantiles = 2,
alpha = 0.8,
color = NA,
scale = 1.05
) +
labs(
title = "Mass spec CV densities by tissue",
x = "Coefficient of variation (%)",
y = NULL
) +
scale_fill_manual(values = cv_ridge_cols, guide = "none") +
theme_minimal(base_size = 12)
}
Ridge plot for supplemental visualization.
These plots confirm whether proteomic quantification remains centered and whether the assay retains the expected dynamic range across the CCLE diversity.
sample_medians <- tibble(
sampleid = colnames(mass_spec_exprs),
median_expr = matrixStats::colMedians(mass_spec_exprs, na.rm = TRUE),
intensity_iqr = matrixStats::colIQRs(mass_spec_exprs, na.rm = TRUE)
) |>
left_join(mass_spec_samples, by = "sampleid")
intensity_hist <- mass_spec_exprs |>
as.vector() |>
tibble(intensity = _) |>
filter(!is.na(intensity)) |>
ggplot(aes(x = intensity)) +
geom_histogram(fill = col_mass_hist, color = "white", bins = 80) +
labs(
title = "Distribution of log2 intensities (mass spec)",
x = "log2 intensity",
y = "Observations"
) +
scale_y_continuous(labels = scales::number_format(accuracy = 1)) +
theme_minimal(base_size = 12)
print(intensity_hist)
The full-distribution view shows that a handful of measurements sit outside the core [-5, 5] range, stretching the x-axis well beyond where the vast majority of intensities lie.
clean_mass_spec <- mass_spec_exprs
mass_spec_out_mask <- clean_mass_spec < -5 | clean_mass_spec > 5
mass_spec_total_vals <- length(clean_mass_spec)
mass_spec_out_count <- sum(mass_spec_out_mask, na.rm = TRUE)
mass_spec_out_pct <- (mass_spec_out_count / mass_spec_total_vals) * 100
cat(sprintf(
"Mass spec intensities outside [-5,5]: %d of %d (%.4f%%)\n",
mass_spec_out_count,
mass_spec_total_vals,
mass_spec_out_pct
))
## Mass spec intensities outside [-5,5]: 7617 of 4770370 (0.1597%)
clean_mass_spec[mass_spec_out_mask] <- NA
clean_intensity_hist <- clean_mass_spec |>
as.vector() |>
tibble(intensity = _) |>
filter(!is.na(intensity)) |>
ggplot(aes(x = intensity)) +
geom_histogram(fill = col_mass_hist_trim, color = "white", bins = 80) +
labs(
title = "Mass spec log2 intensities ([-5, 5] censored)",
x = "log2 intensity",
y = "Observations"
) +
coord_cartesian(xlim = c(-5, 5)) +
scale_y_continuous(labels = scales::number_format(accuracy = 1)) +
theme_minimal(base_size = 12)
print(clean_intensity_hist)
Once the tails are clipped to [-5, 5], the histogram is tightly centered near 0 with no visible skew.
if (!exists("summarize_dynamic_range")) {
summarize_dynamic_range <- function(mat) {
feature_medians <- matrixStats::rowMedians(mat, na.rm = TRUE)
tibble(
feature_id = rownames(mat),
median_intensity = feature_medians
) |>
arrange(desc(median_intensity)) |>
mutate(rank = row_number())
}
}
mass_spec_dynamic <- summarize_dynamic_range(mass_spec_exprs)
ggplot(mass_spec_dynamic, aes(x = rank, y = median_intensity)) +
geom_line(color = col_mass_dynamic) +
labs(
title = "Mass spec protein rank-abundance curve",
x = "Rank (most to least abundant)",
y = "Median log2 intensity"
) +
theme_minimal(base_size = 12)
The curve shows three regimes: a small set of highly abundant proteins, a mid-range plateau, and a tail dropoff only at the detection limit. That indicates the assay maintains depth across most protein ranks and only compresses at the very low end—exactly what we expect from well-normalized TMT data.
tissue_cols_mass <- tissue_palette(sample_medians$tissue)
sample_medians |>
arrange(median_expr) |>
mutate(sampleid = factor(sampleid, levels = sampleid)) |>
ggplot(aes(x = sampleid, y = median_expr, fill = tissue)) +
geom_col(width = 0.6) +
coord_flip() +
labs(
title = "Sample medians coloured by tissue",
x = "Sample",
y = "Median log2 intensity"
) +
scale_fill_manual(values = tissue_cols_mass, na.translate = TRUE) +
theme_minimal(base_size = 11) +
theme(
legend.position = "none",
axis.text.y = element_text(size = 5),
axis.text.x = element_text(size = 9),
plot.margin = margin(5, 12, 5, 5)
)
Notably, hematopoietic profiles dominate the lower tail of this plot with their sample medians cluster around -0.4 to -0.6 log2, reflecting the consistently lower overall signal in immune-derived lines. Kidney and upper aerodigestive cohorts also sit below the global center, whereas large intestine, endometrium, and some CNS lines anchor the upper end (≈0.1 log2). The spread is still narrow (5th–95th percentiles span roughly -0.3 to 0.07), so these shifts capture subtle but reproducible tissue-specific baselines rather than dramatic normalization failures.
The correlation heatmap and associated PCA provide complementary views on replicate agreement and potential batch effects.
if (!exists("median_impute")) {
median_impute <- function(mat) {
if (!is.matrix(mat)) {
mat <- as.matrix(mat)
}
imputed <- mat
row_medians <- matrixStats::rowMedians(mat, na.rm = TRUE)
row_medians[is.na(row_medians) | is.infinite(row_medians)] <- 0
na_idx <- which(is.na(mat), arr.ind = TRUE)
if (nrow(na_idx)) {
imputed[na_idx] <- row_medians[na_idx[, 1]]
}
imputed
}
}
if (!exists("prepare_correlation")) {
prepare_correlation <- function(
mat,
sample_meta,
missing_threshold = 0.25
) {
if (!is.matrix(mat)) {
mat <- as.matrix(mat)
}
keep_features <- rowMeans(is.na(mat)) <= missing_threshold
mat <- mat[keep_features, , drop = FALSE]
mat <- median_impute(mat)
zero_var <- matrixStats::rowVars(mat) > 0
mat <- mat[zero_var, , drop = FALSE]
if (nrow(mat) == 0) {
return(NULL)
}
corr_mat <- cor(mat, method = "pearson")
corr_df <- as.data.frame(corr_mat) |>
tibble::rownames_to_column("sampleid") |>
pivot_longer(
cols = -sampleid,
names_to = "sampleid_b",
values_to = "correlation"
)
annotated <- sample_meta |>
select(sampleid, tissue) |>
mutate(
tissue = if_else(is.na(tissue) | tissue == "", "unspecified", tissue)
)
corr_df |>
left_join(annotated, by = "sampleid") |>
left_join(
annotated,
by = c("sampleid_b" = "sampleid"),
suffix = c("_a", "_b")
)
}
}
mass_spec_cor_df <- prepare_correlation(
mass_spec_exprs,
mass_spec_samples,
missing_threshold = 0.25
)
if (!is.null(mass_spec_cor_df)) {
cor_heatmap_mat <- mass_spec_cor_df |>
pivot_wider(
id_cols = sampleid,
names_from = sampleid_b,
values_from = correlation
) |>
column_to_rownames("sampleid") |>
as.matrix()
ordered_samples <- colnames(cor_heatmap_mat)
annotation_df <- mass_spec_samples |>
mutate(
plex_id = if_else(is.na(plex_id) | plex_id == "", "unspecified", plex_id)
) |>
filter(sampleid %in% ordered_samples) |>
distinct(sampleid, .keep_all = TRUE) |>
arrange(match(sampleid, ordered_samples)) |>
select(sampleid, tissue, plex_id) |>
column_to_rownames("sampleid")
tissue_levels <- sort(unique(annotation_df$tissue))
plex_levels <- sort(unique(annotation_df$plex_id))
annotation_df$tissue <- factor(annotation_df$tissue, levels = tissue_levels)
annotation_df$plex_id <- factor(annotation_df$plex_id, levels = plex_levels)
tissue_colors <- tissue_palette(annotation_df$tissue)
plex_colors <- setNames(
rep(palette_extended, length.out = length(plex_levels)),
plex_levels
)
annotation_colors <- list(
tissue = tissue_colors,
plex_id = plex_colors
)
pheatmap::pheatmap(
cor_heatmap_mat,
color = corr_colors,
breaks = heatmap_breaks,
show_colnames = FALSE,
show_rownames = FALSE,
annotation_col = annotation_df,
annotation_row = annotation_df,
annotation_colors = annotation_colors,
legend = TRUE,
annotation_legend = FALSE,
border_color = NA,
clustering_method = "complete",
main = "Sample-wise Pearson correlations (mass spec)",
silent = FALSE
)
}
This is a symmetric sample–sample self-correlation matrix (each cell is the Pearson correlation between two CCLE proteomic profiles; red diagonal is 1.0 self-correlation). Consistent with the earlier plots, hematopoietic and lymphoid lines cluster tightly both in the dendrogram and as a high-correlation block, while no plex-driven blocks are evident—pointing to biology, not batch, as the dominant structure.
if (!is.null(mass_spec_cor_df)) {
mass_spec_cor_df |>
filter(sampleid != sampleid_b) |>
ggplot(aes(x = correlation)) +
geom_histogram(fill = col_mass_corr_hist, color = "white", bins = 40) +
labs(
title = "Distribution of sample-wise correlations (mass spec)",
x = "Pearson correlation",
y = "Pairs"
) +
theme_minimal(base_size = 12)
}
Most pairwise correlations sit around zero, reflecting the expected lack of similarity between unrelated tissues. A minute right skew corresponds to biologically similar samples.
if (!exists("generate_pca")) {
generate_pca <- function(mat, sample_meta, missing_threshold = 0.25) {
if (!is.matrix(mat)) {
mat <- as.matrix(mat)
}
keep_features <- rowMeans(is.na(mat)) <= missing_threshold
mat <- mat[keep_features, , drop = FALSE]
mat <- median_impute(mat)
zero_var <- matrixStats::rowVars(mat) > 0
mat <- mat[zero_var, , drop = FALSE]
if (nrow(mat) == 0) {
return(NULL)
}
pca <- prcomp(t(mat), center = TRUE, scale. = FALSE)
scores <- as_tibble(pca$x) |>
mutate(sampleid = rownames(pca$x)) |>
left_join(sample_meta, by = "sampleid")
list(
scores = scores,
pca = pca
)
}
}
mass_spec_pca <- generate_pca(
mass_spec_exprs,
mass_spec_samples,
missing_threshold = 0.25
)
if (!is.null(mass_spec_pca)) {
expl_var <- (mass_spec_pca$pca$sdev^2) / sum(mass_spec_pca$pca$sdev^2)
var_labels <- paste0(
names(expl_var),
" (",
round(expl_var * 100, 1),
"%)"
)
legend_rows_ms <- ceiling(length(unique(mass_spec_samples$tissue)) / 6)
pca_tissue_cols <- tissue_palette(mass_spec_pca$scores$tissue)
mass_spec_pca$scores |>
ggplot(aes(x = PC1, y = PC2, color = tissue)) +
geom_point(alpha = 0.8, size = 2) +
labs(
title = "Mass spec PCA (features filtered to ≤25% missing)",
x = var_labels[1],
y = var_labels[2]
) +
coord_equal() +
theme_minimal(base_size = 12) +
theme(
legend.position = "bottom",
legend.title = element_text(size = 9),
legend.text = element_text(size = 7),
legend.box = "horizontal",
legend.margin = margin(t = 4, b = 4)
) +
scale_color_manual(values = pca_tissue_cols, na.translate = TRUE) +
guides(
color = guide_legend(
nrow = max(2, legend_rows_ms),
byrow = TRUE,
override.aes = list(size = 3, alpha = 1)
)
)
}
Hematopoietic and lymphoid samples pull away from solid tumors along PC1, as expected, while the rest mix uniformly; so tissue drives the visible structure rather than technical factors.
if (!is.null(mass_spec_pca)) {
legend_rows_plex <- ceiling(length(unique(mass_spec_samples$plex_id)) / 6)
plex_ids <- sort(unique(if_else(
is.na(mass_spec_samples$plex_id) | mass_spec_samples$plex_id == "",
"unspecified",
mass_spec_samples$plex_id
)))
plex_colors <- setNames(
rep(palette_extended, length.out = length(plex_ids)),
plex_ids
)
mass_spec_pca$scores |>
mutate(
plex_id = if_else(is.na(plex_id) | plex_id == "", "unspecified", plex_id)
) |>
ggplot(aes(x = PC1, y = PC2, color = plex_id)) +
geom_point(alpha = 0.8, size = 2) +
labs(
title = "Mass spec PCA coloured by TMT plex",
x = var_labels[1],
y = var_labels[2]
) +
coord_equal() +
theme_minimal(base_size = 12) +
theme(
legend.position = "bottom",
legend.title = element_text(size = 9),
legend.text = element_text(size = 7),
legend.box = "horizontal",
legend.margin = margin(t = 4, b = 4)
) +
scale_color_manual(values = plex_colors, na.translate = TRUE) +
guides(
color = guide_legend(
nrow = max(2, legend_rows_plex),
byrow = TRUE,
override.aes = list(size = 3, alpha = 1)
)
)
}
Coloring by plex shows no discernible structure.
if (!is.null(rna_tpm)) {
rna_expr <- SummarizedExperiment::assay(rna_tpm, "exprs")
rna_feature_meta <- SummarizedExperiment::rowData(rna_tpm) |>
as.data.frame() |>
as_tibble() |>
mutate(
gene_symbol = coalesce(
as.character(gene_name),
as.character(gene_id)
)
)
common_samples <- intersect(colnames(mass_spec_exprs), colnames(rna_expr))
if (length(common_samples) >= 10) {
mass_spec_common <- mass_spec_exprs[, common_samples, drop = FALSE]
rna_common <- rna_expr[, common_samples, drop = FALSE]
mass_spec_gene_map <- mass_spec_feature_meta |>
transmute(
feature_id = feature_id,
gene_symbol = str_trim(Gene_Symbol)
) |>
filter(!is.na(gene_symbol) & gene_symbol != "")
mass_spec_gene_expr <- rowsum(
mass_spec_common[mass_spec_gene_map$feature_id, , drop = FALSE],
group = mass_spec_gene_map$gene_symbol,
reorder = FALSE
)
rna_gene_symbols <- str_trim(rna_feature_meta$gene_symbol)
valid_rna <- !is.na(rna_gene_symbols) & rna_gene_symbols != ""
rna_gene_expr <- rowsum(
rna_common[valid_rna, , drop = FALSE],
group = rna_gene_symbols[valid_rna],
reorder = FALSE
)
shared_genes <- intersect(
rownames(mass_spec_gene_expr),
rownames(rna_gene_expr)
)
if (length(shared_genes) >= 100) {
mass_spec_shared <- mass_spec_gene_expr[shared_genes, , drop = FALSE]
rna_shared <- rna_gene_expr[shared_genes, , drop = FALSE]
sample_ids <- colnames(mass_spec_shared)
sample_cor <- purrr::map_dbl(
sample_ids,
~ suppressWarnings(cor(
mass_spec_shared[, .x],
rna_shared[, .x],
method = "spearman",
use = "pairwise.complete.obs"
))
)
spearman_summary <- summary(sample_cor)
DT::datatable(
spearman_summary |>
enframe(name = "statistic", value = "value") |>
mutate(value = round(value, 3)),
colnames = c("Statistic", "Value"),
caption = "RNA–protein Spearman correlation summary.",
options = list(dom = 't')
)
tibble(
sampleid = sample_ids,
spearman = sample_cor
) |>
left_join(mass_spec_samples, by = "sampleid") |>
ggplot(aes(x = spearman)) +
geom_histogram(
fill = col_mass_spearman_hist,
color = "white",
bins = 25
) +
labs(
title = "Spearman correlation between RNA and protein abundance",
x = "Spearman rho",
y = "Samples"
) +
theme_minimal(base_size = 12)
}
}
}
RNA–protein concordance centers near 0.30 (IQR ≈ 0.25–0.35): solid evidence that the proteomics agree with matched transcriptomes, while the spread reminds us proteins still capture post-transcriptional biology beyond RNA. That magnitude is typical for CCLE’s cross-modality comparisons. Even though both assays profile the same lines, they come from distinct experimental pipelines and sources, so perfect alignment is not expected.
rppa_samples <- rppa_sample_meta |>
mutate(across(everything(), as.character)) |>
select(
sampleid,
depmap_id,
cell_line,
tissue,
dataset_sample_id,
batchid
) |>
left_join(
cell_meta |>
select(cellid, CCLE.sampleid, CCLE.name, CCLE.site_Primary, CCLE.type),
by = c("sampleid" = "cellid")
) |>
mutate(
ccle_sampleid = coalesce(CCLE.sampleid, sampleid),
dataset_match = dataset_sample_id == ccle_sampleid,
tissue = if_else(is.na(tissue) | tissue == "", "unspecified", tissue)
)
rppa_alignment <- rppa_samples |>
summarise(
samples = n(),
depmap_missing = sum(is.na(depmap_id)),
dataset_mismatch = sum(!dataset_match)
)
DT::datatable(
rppa_alignment,
caption = "RPPA sample alignment diagnostics.",
options = list(dom = 't')
)
Only two TCPA identifiers diverge from CCLE naming conventions and are corrected during curation, yielding zero residual mismatches in the curated PharmacoSet.
dataset_mismatches <- rppa_samples |>
filter(!dataset_match) |>
arrange(sampleid)
if (nrow(dataset_mismatches) > 0) {
mismatch_long <- dataset_mismatches |>
select(
sampleid,
ccle_sampleid,
dataset_sample_id,
depmap_id,
cell_line,
tissue,
CCLE.name,
CCLE.site_Primary,
CCLE.type
) |>
mutate(
sampleid = if_else(
is.na(sampleid) | sampleid == "",
dataset_sample_id,
sampleid
)
) |>
pivot_longer(
cols = -sampleid,
names_to = "field",
values_to = "value"
) |>
pivot_wider(
names_from = sampleid,
values_from = value
) |>
arrange(field)
DT::datatable(
mismatch_long,
colnames = c("Metadata field", colnames(mismatch_long)[-1]),
caption = "Metadata for samples where the TCPA identifier differed from the CCLE sample identifier.",
options = list(scrollX = TRUE)
)
}
The two discrepancies arise from TCPA exporting tissue-qualified
identifiers that do not follow CCLE conventions
(KE97_STOMACH instead of
KE97_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,
NCIH684_LARGE_INTESTINE instead of
NCIH684_LIVER). During PharmacoSet curation we remap the
TCPA identifiers to the canonical CCLE sample IDs shown in the
ccle_sampleid column, ensuring downstream joins and QC
reference the correct lines and tissues.
rppa_tissue_counts <- rppa_samples |>
count(tissue, name = "n") |>
arrange(desc(n))
rppa_tissue_cols_plot <- tissue_palette(rppa_tissue_counts$tissue)
if ("unspecified" %in% names(rppa_tissue_cols_plot)) {
rppa_tissue_cols_plot["unspecified"] <- na_color
}
ggplot(rppa_tissue_counts, aes(x = reorder(tissue, n), y = n, fill = tissue)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(
title = "RPPA sample composition by tissue",
x = "Tissue",
y = "Samples"
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
scale_fill_manual(values = rppa_tissue_cols_plot, na.translate = TRUE) +
theme_minimal(base_size = 12)
rppa_depth <- tibble(
metric = c(
"Antibody probes quantified",
"Phosphoprotein probes",
"Protein probes"
),
value = c(
nrow(rppa_exprs),
sum(rppa_feature_meta$is_phospho),
sum(!rppa_feature_meta$is_phospho)
)
)
DT::datatable(
rppa_depth,
colnames = c("Metric", "Value"),
caption = "RPPA quantification depth.",
options = list(dom = 't')
)
rppa_feature_meta |>
mutate(
has_gene = !is.na(gene_symbol) & gene_symbol != "",
feature_class = if_else(is_phospho, "phosphoprotein", "protein"),
annotation = if_else(has_gene, "mapped", "unmapped")
) |>
count(feature_class, annotation, name = "features") |>
DT::datatable(
colnames = c("Feature class", "Annotation", "Features"),
caption = "RPPA feature annotation summary.",
options = list(dom = 't')
)
We are able to map roughly two thirds of probes to known gene symbols
using org.Hs.eg.db in the curation.
TCPA intensities are usually complete; the table confirms whether any antibodies or plates deviate from that expectation.
rppa_missing <- compute_missingness(rppa_exprs)
rppa_missing_summary <- tibble(
metric = c(
"Sample-level max",
"Sample-level mean",
"Feature-level max",
"Feature-level mean",
"Overall dataset missingness"
),
missing_pct = c(
max(rppa_missing$sample$missing_pct),
mean(rppa_missing$sample$missing_pct),
max(rppa_missing$feature$missing_pct),
mean(rppa_missing$feature$missing_pct),
rppa_missing$dataset$missing_pct
)
) |>
mutate(missing_pct = round(missing_pct, 2))
DT::datatable(
rppa_missing_summary,
colnames = c("Metric", "Missing (%)"),
caption = "RPPA missingness summary at the sample, feature, and dataset levels.",
options = list(dom = 't')
)
All reported statistics evaluate to 0%; no missing data.
rppa_cv <- compute_cv_by_group(
rppa_exprs,
groups = rppa_samples$tissue,
min_group_size = 5,
feature_missing_cutoff = 0.5
)
if (nrow(rppa_cv)) {
rppa_cv_summary <- rppa_cv |>
group_by(tissue) |>
summarise(
median_cv = median(cv, na.rm = TRUE),
iqr_cv = IQR(cv, na.rm = TRUE),
features = n()
) |>
arrange(desc(median_cv)) |>
mutate(
median_cv = round(median_cv, 2),
iqr_cv = round(iqr_cv, 2)
)
DT::datatable(
rppa_cv_summary,
colnames = c("Tissue", "Median CV (%)", "IQR", "Features"),
caption = "Coefficient of variation across tissues (RPPA; computed on 2^intensity scale).",
options = list(pageLength = 10, scrollX = TRUE)
)
rppa_cv_boxplot_df <- rppa_cv |>
mutate(
tissue = fct_reorder(tissue, cv, .fun = median, .desc = TRUE)
)
rppa_cv_boxplot_cols <- tissue_palette(rppa_cv_boxplot_df$tissue)
rppa_cv_boxplot_df |>
ggplot(aes(x = tissue, y = cv, fill = tissue)) +
geom_boxplot(outlier.alpha = 0.1, show.legend = FALSE) +
coord_flip() +
labs(
title = "RPPA CV distribution by tissue",
x = "Tissue (≥5 samples)",
y = "Coefficient of variation (%)"
) +
scale_fill_manual(values = rppa_cv_boxplot_cols, na.translate = TRUE) +
theme_minimal(base_size = 12)
}
RPPA tissues show median CVs clustered below ~18%, mirroring the mass-spec patterns and indicating stable antibody performance after normalization.
The boxplot retains the same Tukey whisker rules described above: outliers beyond 1.5×IQR generally correspond to phospho-probes near the detection floor, where tiny absolute shifts lead to large percent fluctuations.
if (nrow(rppa_cv)) {
if (!requireNamespace("ggridges", quietly = TRUE)) {
stop("Package 'ggridges' is required for ridge density plots.")
}
rppa_cv_ridge_df <- rppa_cv |>
mutate(
tissue = fct_reorder(tissue, cv, .fun = median, .desc = TRUE)
)
rppa_cv_ridge_cols <- tissue_palette(rppa_cv_ridge_df$tissue)
rppa_cv_ridge_df |>
ggplot(aes(x = cv, y = tissue, fill = tissue)) +
ggridges::geom_density_ridges(
quantile_lines = TRUE,
quantiles = 2,
alpha = 0.8,
color = NA,
scale = 1.05
) +
labs(
title = "RPPA CV densities by tissue",
x = "Coefficient of variation (%)",
y = NULL
) +
scale_fill_manual(values = rppa_cv_ridge_cols, guide = "none") +
theme_minimal(base_size = 12)
}
Ridge densities provide the same trend, with insight into finer densities.
rppa_sample_medians <- tibble(
sampleid = colnames(rppa_exprs),
median_expr = matrixStats::colMedians(rppa_exprs, na.rm = TRUE)
) |>
left_join(rppa_samples, by = "sampleid")
rppa_hist <- rppa_exprs |>
as.vector() |>
tibble(intensity = _) |>
filter(!is.na(intensity)) |>
ggplot(aes(x = intensity)) +
geom_histogram(fill = col_rppa_hist, color = "white", bins = 80) +
labs(
title = "Distribution of RPPA log2 intensities",
x = "log2 intensity",
y = "Observations"
) +
scale_y_continuous(labels = scales::number_format(accuracy = 1)) +
theme_minimal(base_size = 12)
print(rppa_hist)
The same long-range axis inflation happens here as with mass spec: a small number of probes sit just outside [-5, 5], making the main density look compressed.
rppa_trimmed <- rppa_exprs
rppa_out_mask <- rppa_trimmed < -5 | rppa_trimmed > 5
rppa_total_vals <- length(rppa_trimmed)
rppa_out_count <- sum(rppa_out_mask, na.rm = TRUE)
rppa_out_pct <- (rppa_out_count / rppa_total_vals) * 100
cat(sprintf(
"RPPA intensities outside [-5,5]: %d of %d (%.4f%%)\n",
rppa_out_count,
rppa_total_vals,
rppa_out_pct
))
## RPPA intensities outside [-5,5]: 755 of 392019 (0.1926%)
rppa_trimmed[rppa_out_mask] <- NA
rppa_hist_trimmed <- rppa_trimmed |>
as.vector() |>
tibble(intensity = _) |>
filter(!is.na(intensity)) |>
ggplot(aes(x = intensity)) +
geom_histogram(fill = col_rppa_hist_trim, color = "white", bins = 80) +
labs(
title = "RPPA log2 intensities ([-5, 5] censored)",
x = "log2 intensity",
y = "Observations"
) +
coord_cartesian(xlim = c(-5, 5)) +
scale_y_continuous(labels = scales::number_format(accuracy = 1)) +
theme_minimal(base_size = 12)
print(rppa_hist_trimmed)
The censored view confirms the RPPA intensities are also well centered around 0 with only negligible tails.
rppa_dynamic <- summarize_dynamic_range(rppa_exprs)
ggplot(rppa_dynamic, aes(x = rank, y = median_intensity)) +
geom_line(color = col_rppa_dynamic) +
labs(
title = "RPPA rank-abundance curve",
x = "Rank (most to least abundant)",
y = "Median log2 intensity"
) +
theme_minimal(base_size = 12)
The RPPA curve shows a gradual decay through roughly the top 100 probes, a flat shoulder from ranks ~100–400, and then a sharper drop toward the detection floor. That shape mirrors expectations for an antibody panel that mixes a few high-signal housekeeping targets with a large mid-level cohort and a short low-abundance tail. No abrupt cliffs that would hint at failed probes or normalization issues.
rppa_tissue_cols <- tissue_palette(rppa_sample_medians$tissue)
rppa_sample_medians |>
arrange(median_expr) |>
mutate(sampleid = factor(sampleid, levels = sampleid)) |>
ggplot(aes(x = sampleid, y = median_expr, fill = tissue)) +
geom_col(width = 0.5) +
coord_flip() +
labs(
title = "RPPA sample medians coloured by tissue",
x = "Sample",
y = "Median log2 intensity"
) +
scale_fill_manual(values = rppa_tissue_cols, na.translate = TRUE) +
theme_minimal(base_size = 11) +
theme(
legend.position = "none",
axis.text.y = element_text(size = 4),
axis.text.x = element_text(size = 9),
plot.margin = margin(5, 14, 5, 5)
)
RPPA sample medians span roughly -0.15 to 0.16 log2 overall (5th–95th percentiles ≈ -0.05 to 0.07). Autonomic ganglia, endometrium, and liver occupy the lower end of the plot, while urinary tract, breast, kidney, skin, and pleura cohorts sit slightly above zero. The narrow spread reinforces that global RPPA scaling is consistent across tissues, making it easy to spot any future batches that drift away from the central band.
rppa_cor_df <- prepare_correlation(
rppa_exprs,
rppa_samples,
missing_threshold = 0.2
)
if (!is.null(rppa_cor_df)) {
cor_heatmap_mat <- rppa_cor_df |>
pivot_wider(
id_cols = sampleid,
names_from = sampleid_b,
values_from = correlation
) |>
column_to_rownames("sampleid") |>
as.matrix()
ordered_samples <- colnames(cor_heatmap_mat)
annotation_df <- rppa_samples |>
mutate(
batchid = if_else(is.na(batchid) | batchid == "", "unspecified", batchid)
) |>
filter(sampleid %in% ordered_samples) |>
distinct(sampleid, .keep_all = TRUE) |>
arrange(match(sampleid, ordered_samples)) |>
select(sampleid, tissue) |>
column_to_rownames("sampleid")
tissue_levels <- sort(unique(annotation_df$tissue))
annotation_df$tissue <- factor(annotation_df$tissue, levels = tissue_levels)
tissue_colors <- tissue_palette(annotation_df$tissue)
annotation_colors <- list(
tissue = tissue_colors
)
pheatmap::pheatmap(
cor_heatmap_mat,
color = corr_colors,
breaks = heatmap_breaks,
show_colnames = FALSE,
show_rownames = FALSE,
annotation_col = annotation_df,
annotation_row = annotation_df,
annotation_colors = annotation_colors,
legend = TRUE,
annotation_legend = FALSE,
border_color = NA,
clustering_method = "complete",
main = "Sample-wise Pearson correlations (RPPA)",
silent = FALSE
)
}
The heatmap clusters pairwise sample correlations and overlays tissue annotations along both axes. The red diagonal represents sample self-correlation (all exactly 1), while the flanking blocks show tissues with shared signatures: hematopoietic/lymphoid, CNS, and skin lines each form tight clusters. Nearly every off-diagonal entry remains positive, reinforcing that RPPA profiles are broadly concordant across samples.
if (!is.null(rppa_cor_df)) {
rppa_cor_summary <- rppa_cor_df |>
filter(sampleid != sampleid_b) |>
summarise(
min = min(correlation, na.rm = TRUE),
q1 = quantile(correlation, 0.25, na.rm = TRUE),
median = median(correlation, na.rm = TRUE),
mean = mean(correlation, na.rm = TRUE),
q3 = quantile(correlation, 0.75, na.rm = TRUE),
max = max(correlation, na.rm = TRUE)
) |>
pivot_longer(everything(), names_to = "statistic", values_to = "value") |>
mutate(value = round(value, 3))
rppa_cor_df |>
filter(sampleid != sampleid_b) |>
ggplot(aes(x = correlation)) +
geom_histogram(fill = col_rppa_corr_hist, color = "white", bins = 40) +
labs(
title = "Distribution of sample-wise correlations (RPPA)",
x = "Pearson correlation",
y = "Pairs"
) +
theme_minimal(base_size = 12)
DT::datatable(
rppa_cor_summary,
colnames = c("Statistic", "Value"),
caption = "Summary statistics for RPPA sample-wise correlations.",
options = list(dom = 't')
)
}
The histogram traces a smooth, near-normal bell that leans slightly to the right but stays almost entirely above zero. That overall positive trend indicates RPPA profiles are broadly concordant, with only a thin tail of near-identical pairs pushing correlations toward one.
rppa_pca <- generate_pca(
rppa_exprs,
rppa_samples,
missing_threshold = 0.2
)
if (!is.null(rppa_pca)) {
expl_var <- (rppa_pca$pca$sdev^2) / sum(rppa_pca$pca$sdev^2)
var_labels <- paste0(
names(expl_var),
" (",
round(expl_var * 100, 1),
"%)"
)
legend_rows_rppa <- ceiling(length(unique(rppa_samples$tissue)) / 6)
rppa_tissue_cols <- tissue_palette(rppa_pca$scores$tissue)
rppa_pca$scores |>
ggplot(aes(x = PC1, y = PC2, color = tissue)) +
geom_point(alpha = 0.8, size = 2) +
labs(
title = "RPPA PCA (features filtered to ≤20% missing)",
x = var_labels[1],
y = var_labels[2]
) +
coord_equal() +
theme_minimal(base_size = 12) +
theme(
legend.position = "bottom",
legend.title = element_text(size = 9),
legend.text = element_text(size = 7),
legend.box = "horizontal",
legend.margin = margin(t = 4, b = 4)
) +
scale_color_manual(values = rppa_tissue_cols, na.translate = TRUE) +
guides(
color = guide_legend(
nrow = max(2, legend_rows_rppa),
byrow = TRUE,
override.aes = list(size = 3, alpha = 1)
)
)
}
The PCA mirrors earlier plots: hematopoietic and lymphoid samples peel away from the epithelial core along the first two PCs.
if (!is.null(rna_tpm)) {
rna_genes <- SummarizedExperiment::rowData(rna_tpm) |>
as.data.frame() |>
as_tibble() |>
transmute(
gene_symbol = coalesce(
as.character(gene_name),
as.character(gene_id)
)
)
rppa_overlap <- tibble(
feature_id = rppa_feature_meta$feature_id,
gene_symbol = coalesce(
rppa_feature_meta$gene_symbol,
rppa_feature_meta$clean_id
)
) |>
filter(!is.na(gene_symbol) & gene_symbol != "") |>
mutate(in_rnaseq = gene_symbol %in% rna_genes$gene_symbol) |>
count(in_rnaseq, name = "genes") |>
mutate(
label = if_else(in_rnaseq, "present in RNA-seq", "absent from RNA-seq"),
proportion = round(genes / sum(genes), 3)
)
DT::datatable(
rppa_overlap,
colnames = c("In RNA-seq", "Genes", "Label", "Proportion"),
caption = "RNA-seq coverage for RPPA targets.",
options = list(dom = 't')
)
}
About 61% of RPPA targets map to genes represented in the RNA-seq panel, so downstream RNA–protein comparisons cover the majority of antibodies while flagging the 39% of probes lacking matched transcripts.
if (!is.null(rna_tpm)) {
rna_expr <- SummarizedExperiment::assay(rna_tpm, "exprs")
rna_feature_meta <- SummarizedExperiment::rowData(rna_tpm) |>
as.data.frame() |>
as_tibble() |>
mutate(
gene_symbol = str_trim(coalesce(
as.character(gene_name),
as.character(gene_id)
))
)
common_samples <- intersect(colnames(rppa_exprs), colnames(rna_expr))
if (length(common_samples) >= 10) {
rppa_common <- rppa_exprs[, common_samples, drop = FALSE]
rna_common <- rna_expr[, common_samples, drop = FALSE]
rppa_gene_map <- rppa_feature_meta |>
transmute(
feature_id = feature_id,
gene_symbol = str_trim(coalesce(gene_symbol, clean_id))
) |>
filter(!is.na(gene_symbol) & gene_symbol != "")
rppa_gene_expr <- rowsum(
rppa_common[rppa_gene_map$feature_id, , drop = FALSE],
group = rppa_gene_map$gene_symbol,
reorder = FALSE
)
rna_gene_symbols <- rna_feature_meta$gene_symbol
valid_rna <- !is.na(rna_gene_symbols) & rna_gene_symbols != ""
rna_gene_expr <- rowsum(
rna_common[valid_rna, , drop = FALSE],
group = rna_gene_symbols[valid_rna],
reorder = FALSE
)
shared_genes <- intersect(
rownames(rppa_gene_expr),
rownames(rna_gene_expr)
)
if (length(shared_genes) >= 50) {
rppa_shared <- rppa_gene_expr[shared_genes, , drop = FALSE]
rna_shared <- rna_gene_expr[shared_genes, , drop = FALSE]
sample_ids <- colnames(rppa_shared)
sample_cor <- purrr::map_dbl(
sample_ids,
~ suppressWarnings(cor(
rppa_shared[, .x],
rna_shared[, .x],
method = "spearman",
use = "pairwise.complete.obs"
))
)
spearman_summary <- summary(sample_cor)
DT::datatable(
spearman_summary |>
enframe(name = "statistic", value = "value") |>
mutate(value = round(value, 3)),
colnames = c("Statistic", "Value"),
caption = "RNA–RPPA Spearman correlation summary.",
options = list(dom = 't')
)
tibble(
sampleid = sample_ids,
spearman = sample_cor
) |>
left_join(rppa_samples, by = "sampleid") |>
ggplot(aes(x = spearman)) +
geom_histogram(
fill = col_rppa_corr_hist,
color = "white",
bins = 25
) +
labs(
title = "Spearman correlation between RNA and RPPA abundance",
x = "Spearman rho",
y = "Samples"
) +
theme_minimal(base_size = 12)
}
}
}
Spearman rho measures how well two profiles maintain rank order, so each histogram bar captures how closely a given cell line’s RPPA intensities track its RNA abundance across shared genes. The distribution stays positive with modest spread (centered around ~0.4), mirroring mass-spec: RPPA and RNA generally move together while still leaving room for protein-specific regulation.
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: CachyOS
##
## Matrix products: default
## BLAS: /usr/lib/libblas.so.3.12.0
## LAPACK: /usr/lib/liblapack.so.3.12.0 LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8
## [5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
## [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/Toronto
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] UpSetR_1.4.0 pheatmap_1.0.13
## [3] rlang_1.1.6 paletteer_1.6.0
## [5] data.table_1.17.8 DT_0.34.0
## [7] patchwork_1.3.2 here_1.0.2
## [9] scales_1.4.0 lubridate_1.9.4
## [11] forcats_1.0.1 stringr_1.5.2
## [13] dplyr_1.1.4 purrr_1.2.0
## [15] readr_2.1.5 tidyr_1.3.1
## [17] tibble_3.3.0 ggplot2_4.0.0
## [19] tidyverse_2.0.0 MultiAssayExperiment_1.34.0
## [21] PharmacoGx_3.12.2 CoreGx_2.12.0
## [23] SummarizedExperiment_1.38.1 Biobase_2.68.0
## [25] GenomicRanges_1.60.0 GenomeInfoDb_1.44.3
## [27] IRanges_2.42.0 S4Vectors_0.46.0
## [29] MatrixGenerics_1.20.0 matrixStats_1.5.0
## [31] BiocGenerics_0.54.1 generics_0.1.4
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-9 gridExtra_2.3 rematch2_2.1.2
## [4] magrittr_2.0.4 shinydashboard_0.7.3 otel_0.2.0
## [7] ggridges_0.5.7 compiler_4.5.2 vctrs_0.6.5
## [10] reshape2_1.4.4 relations_0.6-15 pkgconfig_2.0.3
## [13] crayon_1.5.3 fastmap_1.2.0 backports_1.5.0
## [16] XVector_0.48.0 labeling_0.4.3 caTools_1.18.3
## [19] promises_1.5.0 rmarkdown_2.30 tzdb_0.5.0
## [22] UCSC.utils_1.4.0 coop_0.6-3 xfun_0.54
## [25] cachem_1.1.0 jsonlite_2.0.0 SnowballC_0.7.1
## [28] later_1.4.4 DelayedArray_0.34.1 scico_1.5.0
## [31] BiocParallel_1.42.2 parallel_4.5.2 sets_1.0-25
## [34] cluster_2.1.8.1 R6_2.6.1 bslib_0.9.0
## [37] stringi_1.8.7 RColorBrewer_1.1-3 limma_3.64.3
## [40] boot_1.3-32 jquerylib_0.1.4 Rcpp_1.1.0
## [43] knitr_1.50 downloader_0.4.1 BiocBaseUtils_1.10.0
## [46] timechange_0.3.0 httpuv_1.6.16 Matrix_1.7-4
## [49] igraph_2.2.1 tidyselect_1.2.1 abind_1.4-8
## [52] yaml_2.3.10 gplots_3.2.0 codetools_0.2-20
## [55] lattice_0.22-7 plyr_1.8.9 withr_3.0.2
## [58] shiny_1.11.1 BumpyMatrix_1.16.0 S7_0.2.0
## [61] evaluate_1.0.5 bench_1.1.4 pillar_1.11.1
## [64] lsa_0.73.3 KernSmooth_2.23-26 checkmate_2.3.3
## [67] shinyjs_2.1.0 piano_2.24.0 rprojroot_2.1.1
## [70] hms_1.1.4 gtools_3.9.5 xtable_1.8-4
## [73] marray_1.86.0 glue_1.8.0 slam_0.1-55
## [76] tools_4.5.2 fgsea_1.34.2 visNetwork_2.1.4
## [79] fastmatch_1.1-6 cowplot_1.2.0 crosstalk_1.2.2
## [82] GenomeInfoDbData_1.2.14 cli_3.6.5 S4Arrays_1.8.1
## [85] gtable_0.3.6 sass_0.4.10 digest_0.6.37
## [88] prismatic_1.1.2 SparseArray_1.8.1 htmlwidgets_1.6.4
## [91] farver_2.1.2 htmltools_0.5.8.1 lifecycle_1.0.4
## [94] httr_1.4.7 statmod_1.5.1 mime_0.13